Decription of article

!!- Decription of dataset Gene expression analyses in leukocytes sorted from normal breast tissues, ductal carcinomas in situ (DCIS), and HER2+ and triple negative invasive ductal carcinomas (IDC) Source name resected breast tumor Organism Homo sapiens Characteristics tissue of origin: breast (tumor) cell type: CD45+CD3+ T cell breast tumor subtype: DCIS

RNA was isolated from purified CD45+CD3+ leukocytes by cell sorting

Results in A1 !! normalization - from the slides Statistics: - MDS Plot - Density Plot - Boxplot

Prework

Install all the required packages

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
if (!requireNamespace("Biobase", quietly = TRUE))
    BiocManager::install("Biobase")
if (!requireNamespace("ComplexHeatmap", quietly = TRUE))
    BiocManager::install("ComplexHeatmap")
if (!requireNamespace("circlize", quietly = TRUE))
    BiocManager::install("circlize")
if (!requireNamespace("gprofiler2", quietly = TRUE))
    install.packages("gprofiler2")

Reference the packages

  library(Biobase)
  library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.2.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## ========================================
  library(circlize)
## ========================================
## circlize version 0.4.8
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization 
##   in R. Bioinformatics 2014.
## ========================================
  library(gprofiler2)

Gene of interest

Look at the most common genes of the same subfamily in our dataset

#Find the genes of interest
#Start with strongest signals
#The most common genes of the same subfamily
rownames(exp_filtered) <- exp_filtered$Feature
genes_sample <- data.frame(lapply(rownames(normalized_data), FUN = function(x){unlist(strsplit(x, "(?=[A-Za-z])(?<=[0-9])|(?=[0-9])(?<=[A-Za-z])", perl=TRUE))[c(1,2)]}))
colnames(genes_sample) <- colnames(exp_filtered)[2:42]
rownames(genes_sample) <- c("subfamily", "num")
genes_sample <- data.frame(t(genes_sample))

summarized_gene_counts <- sort(table(genes_sample$subfamily),decreasing = TRUE)

head(summarized_gene_counts)
## 
##  LOC LINC  ZNF    C  SLC  FAM 
## 1355  632  522  477  380  300

!! LOC - family of RNA gene - not too interesting CD - CD8A CTLA-4 signaling pathways - mentioned in the results of the article multiple times

#CD8A
dcis_samples <- grep(colnames(normalized_data),
                          pattern="DCIS")
idc_samples <- grep(colnames(normalized_data),
                          pattern="IDC")
normal_samples <- grep(colnames(normalized_data),
                    pattern="N")
gene_of_interest <- "CD8A"

CD8A_dcis_samples <- data.matrix(normalized_data
                       [gene_of_interest,
                         dcis_samples])
colnames(CD8A_dcis_samples) <- c("dcis_samples")
CD8A_idc_samples <- data.matrix(normalized_data
                       [gene_of_interest,
                         idc_samples])
colnames(CD8A_idc_samples) <- c("idc_samples")
CD8A_normal_samples <- data.matrix(normalized_data
                       [gene_of_interest,
                         normal_samples])
colnames(CD8A_normal_samples) <- c("normal_samples")

# DCIS VS Normal
t.test(x=t(CD8A_dcis_samples),y=t(CD8A_normal_samples))
## 
##  Welch Two Sample t-test
## 
## data:  t(CD8A_dcis_samples) and t(CD8A_normal_samples)
## t = -0.92393, df = 17.895, p-value = 0.3678
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.4529180  0.9548865
## sample estimates:
## mean of x mean of y 
##  4.331695  5.080711
# IDC vs Normal
t.test(x=t(CD8A_idc_samples),y=t(CD8A_normal_samples))
## 
##  Welch Two Sample t-test
## 
## data:  t(CD8A_idc_samples) and t(CD8A_normal_samples)
## t = -3.8856, df = 26.022, p-value = 0.0006289
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.583839 -1.104006
## sample estimates:
## mean of x mean of y 
##  2.736788  5.080711
#CTLA4
gene_of_interest <- "CTLA4"

CTLA4_dcis_samples <- data.matrix(normalized_data
                                 [gene_of_interest,
                                   dcis_samples])
colnames(CTLA4_dcis_samples) <- c("dcis_samples")
CTLA4_idc_samples <- data.matrix(normalized_data
                                [gene_of_interest,
                                  idc_samples])
colnames(CTLA4_idc_samples) <- c("idc_samples")
CTLA4_normal_samples <- data.matrix(normalized_data
                                   [gene_of_interest,
                                     normal_samples])
colnames(CTLA4_normal_samples) <- c("normal_samples")

# DCIS VS Normal
t.test(x=t(CTLA4_dcis_samples),y=t(CTLA4_normal_samples))
## 
##  Welch Two Sample t-test
## 
## data:  t(CTLA4_dcis_samples) and t(CTLA4_normal_samples)
## t = 1.4857, df = 13.051, p-value = 0.1611
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5580282  3.0188473
## sample estimates:
## mean of x mean of y 
##  2.339701  1.109291
# IDC vs Normal
t.test(x=t(CTLA4_idc_samples),y=t(CTLA4_normal_samples))
## 
##  Welch Two Sample t-test
## 
## data:  t(CTLA4_idc_samples) and t(CTLA4_normal_samples)
## t = 3.5966, df = 11.811, p-value = 0.003758
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.475537 6.031031
## sample estimates:
## mean of x mean of y 
##  4.862575  1.109291

Differential expression analysis

Look at the article and get the conclusion What does P-value do? P-value = 0.05 and multiple hypothesis correction on the p-values ?

Quasi liklihood DEF - used for more complicated models and is highly recommended for bulk RNASeq experiments. (glmQLFTest)

Design the model

subtypes <- samples$subtype
model_design <- model.matrix(~ 0 + subtypes)
expressionMatrix <- as.matrix(normalized_data[,1:41])
rownames(expressionMatrix) <- rownames(normalized_data)
colnames(expressionMatrix) <- colnames(normalized_data)[1:41]
minSet <- ExpressionSet(assayData=expressionMatrix)

#Fit the model
fit <- lmFit(minSet, model_design)

#Normalize the data
dm <-  as.matrix(exp_filtered[,2:42])
rownames(dm) <- exp_filtered$Feature
d <- DGEList(counts=dm, group=samples$subtype)

d <- calcNormFactors(d)

#MDS Plot

plotMDS(d, labels=NULL, pch = 1, 
        col= c("darkgreen","blue","red")[factor(samples$subtype)])
legend("topright", 
       legend=levels(factor(samples$subtype)), 
       pch=c(1), col= c("darkgreen","blue","red"),
       title="Class",  
       bty = 'n', cex = 0.5)

#Estimate Dispersion - our model design.
d <- estimateDisp(d, model_design)

#Fit the model
fit <- glmQLFit(d, model_design)


#Calculate differential expression
#DCIS vs Normal
contrast_dcisVSnormal <- makeContrasts(
  dcisVSnormal ="subtypesDCIS-subtypesN",
  levels=model_design)

#IDC vs Normal
contrast_idcVSnormal <- makeContrasts(
  dcisVSnormal ="subtypesIDC-subtypesN",
  levels=model_design)

#Normal vs all
contrast_normal <- makeContrasts(
  immunovsrest ="subtypesN-(subtypesDCIS + 
  subtypesIDC)/2",
  levels=model_design)



#Get all the results
qlf.dcis_vs_n <- glmQLFTest(fit, contrast=contrast_dcisVSnormal)
tt_dcis_vs_n <- topTags(qlf.dcis_vs_n,n=nrow(d))
qlf.idc_vs_n <- glmQLFTest(fit, contrast=contrast_idcVSnormal)
tt_idc_vs_n <- topTags(qlf.idc_vs_n,n=nrow(d))
qlf.normal_vs_all <- glmQLFTest(fit, contrast=contrast_normal)
tt_normal_vs_all <- topTags(qlf.normal_vs_all,n=nrow(d))


#How many gene pass the threshold p-value < 0.05?
length(which(tt_dcis_vs_n$table$PValue<0.05))
## [1] 7130
length(which(tt_idc_vs_n$table$PValue<0.05))
## [1] 7401
length(which(tt_normal_vs_all$table$PValue<0.05))
## [1] 8066
#How many genes pass correction?
#using BH as adjusted method
length(which(tt_dcis_vs_n$table$FDR < 0.01))
## [1] 1346
length(which(tt_idc_vs_n$table$FDR < 0.01))
## [1] 1188
length(which(tt_idc_vs_n$table$FDR < 0.01))
## [1] 1188
#Heatmap
heatmap_matrix <- normalized_data[,1:ncol(normalized_data)]
rownames(heatmap_matrix) <- rownames(normalized_data)
colnames(heatmap_matrix) <- colnames(normalized_data[,1:ncol(normalized_data)])

top_hits <- rownames(tt_normal_vs_all)[which(tt_normal_vs_all$table$FDR < 0.01)]
heatmap_matrix_tophits <- t(
  scale(t(heatmap_matrix[which(rownames(heatmap_matrix) %in% top_hits),]))) 
if(min(heatmap_matrix_tophits) == 0){
  heatmap_col = colorRamp2(c( 0, max(heatmap_matrix_tophits)), 
                           c( "white", "red"))
} else {
  heatmap_col = colorRamp2(c(min(heatmap_matrix_tophits), 0, max(heatmap_matrix_tophits)), c("blue", "white", "red"))
}
current_heatmap <- Heatmap(as.matrix(heatmap_matrix_tophits),
                           cluster_rows = TRUE,  show_row_dend = TRUE,
                           cluster_columns = TRUE,show_column_dend = FALSE,
                           col=heatmap_col,show_column_names = FALSE, 
                           show_row_names = FALSE,show_heatmap_legend = TRUE)
n_colours <- c("darkgreen","blue","orange")
names(n_colours) <- unique(samples$subtype)
n <- HeatmapAnnotation(df=data.frame(
  type = samples$subtype),
  col =  list(type = n_colours))
current_heatmap <- Heatmap(as.matrix(heatmap_matrix_tophits),
                           cluster_rows = TRUE,  show_row_dend = TRUE,
                           cluster_columns = TRUE,show_column_dend = FALSE,
                           col=heatmap_col,show_column_names = FALSE, 
                           show_row_names = FALSE,show_heatmap_legend = TRUE,
                           top_annotation = n)
current_heatmap

Up regualted genes

diff_exp <- merge(exp[,1:1],tt_normal_vs_all, by.x=1, by.y = 0)
diff_exp[,"rank"] <- -log(diff_exp$PValue,base =10) * sign(diff_exp$logFC)
length(which(tt_normal_vs_all$table$PValue <= 0.05 
                       & tt_normal_vs_all$table$logFC >= 1))
## [1] 5942
up_genes <- diff_exp$x[which(diff_exp$PValue <= 0.05 & diff_exp$logFC >= 1)]

Down regulated genes

length(which(tt_normal_vs_all$table$PValue < 0.05 
                       & tt_normal_vs_all$table$logFC <= -1))
## [1] 2123
down_genes <- diff_exp$x[which(diff_exp$PValue < 0.05 & diff_exp$logFC <= -1)]

Save all genes lists

write.table(x=up_genes,
            file=file.path("up_genes.txt"),sep = "\t",
            row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=down_genes,
            file=file.path("down_genes.txt"),sep = "\t",
            row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=data.frame(genename= diff_exp$x,F_stat= diff_exp$rank),
            file=file.path("ranked_genelist.txt"),sep = "\t",
            row.names = FALSE,col.names = FALSE,quote = FALSE)

Over-representation analysis

up_gpro <- gost(up_genes, organism = "hsapiens", ordered_query = FALSE,
     multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
     measure_underrepresentation = FALSE, evcodes = FALSE,
     user_threshold = 0.05, correction_method = c("fdr"),
     domain_scope = c("annotated"),
     custom_bg = NULL, numeric_ns = "", sources = NULL,
     as_short_link = FALSE)

head(up_gpro$result)
##     query significant      p_value term_size query_size intersection_size
## 1 query_1        TRUE 8.599586e-05     16622       4383              3781
## 2 query_1        TRUE 8.599586e-05     16622       4383              3781
## 3 query_1        TRUE 1.950758e-04     13260       4383              3066
## 4 query_1        TRUE 1.950758e-04      4641       4383              1155
## 5 query_1        TRUE 2.033616e-04      8420       4383              2006
## 6 query_1        TRUE 3.662015e-04     14265       4383              3275
##   precision    recall     term_id source
## 1 0.8626512 0.2274696 TF:M01240_0     TF
## 2 0.8626512 0.2274696   TF:M01240     TF
## 3 0.6995209 0.2312217 TF:M01240_1     TF
## 4 0.2635181 0.2488688 TF:M09729_1     TF
## 5 0.4576774 0.2382423 TF:M04106_1     TF
## 6 0.7472051 0.2295829 TF:M00986_0     TF
##                                                term_name effective_domain_size
## 1           Factor: BEN; motif: CAGCGRNV; match class: 0                 19887
## 2                           Factor: BEN; motif: CAGCGRNV                 19887
## 3           Factor: BEN; motif: CAGCGRNV; match class: 1                 19887
## 4       Factor: LUMAN; motif: CYCAGCYYCY; match class: 1                 19887
## 5 Factor: RUNX2; motif: NRACCGCAAACCGCAN; match class: 1                 19887
## 6       Factor: Churchill; motif: CGGGNN; match class: 0                 19887
##   source_order     parents
## 1          429   TF:M01240
## 2          428   TF:M00000
## 3          430 TF:M01240_0
## 4         4072 TF:M09729_0
## 5         6112 TF:M04106_0
## 6          726   TF:M00986
#Plot
gostplot(up_gpro, capped = TRUE, interactive = TRUE)
#View all in a chart instead
publish_gosttable(up_gpro, highlight_terms = up_gpro$result[c(1:10),],
                  use_colors = TRUE, 
                  show_columns = c("source", "term_name", "term_size", "intersection_size"),
                  filename = NULL)

down_gpro <- gost(down_genes, organism = "hsapiens", ordered_query = FALSE,
                multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
                measure_underrepresentation = FALSE, evcodes = FALSE,
                user_threshold = 0.05, correction_method = c("fdr"),
                domain_scope = c("annotated"),
                custom_bg = NULL, numeric_ns = "", sources = NULL,
                as_short_link = FALSE)
head(down_gpro$result)
##     query significant      p_value term_size query_size intersection_size
## 1 query_1        TRUE 7.625172e-04      1121       1466               142
## 2 query_1        TRUE 2.429320e-03      1217       1466               148
## 3 query_1        TRUE 6.657958e-03       797       1466               103
## 4 query_1        TRUE 2.432283e-05     17069       1516              1430
## 5 query_1        TRUE 6.711700e-05     17004       1516              1422
## 6 query_1        TRUE 6.711700e-05      9699       1516               876
##    precision     recall    term_id source
## 1 0.09686221 0.12667261 GO:0071345  GO:BP
## 2 0.10095498 0.12161052 GO:0034097  GO:BP
## 3 0.07025921 0.12923463 GO:0019221  GO:BP
## 4 0.94327177 0.08377761 GO:0005623  GO:CC
## 5 0.93799472 0.08362738 GO:0044464  GO:CC
## 6 0.57783641 0.09031859 GO:0044446  GO:CC
##                                term_name effective_domain_size source_order
## 1 cellular response to cytokine stimulus                 17847        19858
## 2                   response to cytokine                 17847         9761
## 3    cytokine-mediated signaling pathway                 17847         6545
## 4                                   cell                 18842          300
## 5                              cell part                 18842         2403
## 6           intracellular organelle part                 18842         2386
##                              parents
## 1             GO:0034097, GO:0071310
## 2                         GO:0010033
## 3             GO:0007166, GO:0071345
## 4                         GO:0005575
## 5             GO:0005575, GO:0005623
## 6 GO:0043229, GO:0044424, GO:0044422
#Plot
gostplot(down_gpro, capped = TRUE, interactive = TRUE)
publish_gosttable(down_gpro, highlight_terms = down_gpro$result[c(1:10),],
                        use_colors = TRUE, 
                        show_columns = c("source", "term_name", "term_size", "intersection_size"),
                        filename = NULL)